The VDJ is integrated to the Seurat object using the metadata slot. This permits the VDJ features to be used in plotting and other Seurat functions such as finding gene markers.
The RScript for this workflow is available on github in the workshop respository.
We will require Seurat to interact with the Seurat object generated from processing of the gene expression data. We will load the Seurat objects for the datasets from RDS. Once again will be be using the tidyverse packages for data manipulation, ggpubr for plot layouts and also adding stats to panels, ggsci for colour palettes and rstatix for generating ggpubr/ggplot2 compatible data for showing statistics on panels.
#installing R packages
#check if the package is already installed, if not, install it
if (sum(grepl("tidyverse", rownames(installed.packages()))) > 0) {
print("tidyverse is already installed")
} else {
install.packages("tidyverse")
}
## [1] "tidyverse is already installed"
if (sum(grepl("Seurat", rownames(installed.packages()))) > 0) {
print("Seurat is already installed")
} else {
install.packages("Seurat")
}
## [1] "Seurat is already installed"
if (sum(grepl("ggpubr", rownames(installed.packages()))) > 0) {
print("ggpubr is already installed")
} else {
install.packages("ggpubr")
}
## [1] "ggpubr is already installed"
if (sum(grepl("ggsci", rownames(installed.packages()))) > 0) {
print("ggsci is already installed")
} else {
install.packages("ggsci")
}
## [1] "ggsci is already installed"
if (sum(grepl("rstatix", rownames(installed.packages()))) > 0) {
print("rstatix is already installed")
} else {
install.packages("rstatix")
}
## [1] "rstatix is already installed"
#loading R packages
#tidyverse packages for data manipulation and plotting
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
#in order to work with the seurat objects from the scRNA-seq
library(Seurat)
## Loading required package: SeuratObject
## Loading required package: sp
## The legacy packages maptools, rgdal, and rgeos, underpinning the sp package,
## which was just loaded, will retire in October 2023.
## Please refer to R-spatial evolution reports for details, especially
## https://r-spatial.org/r/2023/05/15/evolution4.html.
## It may be desirable to make the sf package available;
## package maintainers should consider adding sf to Suggests:.
## The sp package is now running under evolution status 2
## (status 2 uses the sf package in place of rgdal)
##
## Attaching package: 'SeuratObject'
##
## The following objects are masked from 'package:base':
##
## intersect, saveRDS
##
## Loading Seurat v5 beta version
## To maintain compatibility with previous workflows, new Seurat objects will use the previous object structure by default
## To use new Seurat v5 assays: Please run: options(Seurat.object.assay.version = 'v5')
#multi-panel plots
library(ggpubr)
#colour scales for scientific journals
library(ggsci)
#stats package for ggplot2
library(rstatix)
##
## Attaching package: 'rstatix'
##
## The following object is masked from 'package:stats':
##
## filter
#themes for plots
theme_custom <- function (base_size = 12) {
theme_bw(base_size=base_size) %+replace%
theme(panel.grid = element_blank(),
panel.border = element_blank(),
axis.line = element_line(colour="black"),
axis.text.x = element_text(colour="black", size=base_size),
axis.text.y = element_text(colour="black", size = base_size, hjust = 1),
axis.title = element_text(colour="black", face="bold", size = base_size),
strip.background = element_blank(),
strip.text = element_text(colour="black", face="bold", size=base_size),
legend.text = element_text(colour="black", size=base_size)
)
}
theme_x_rotated <- function (base_size = 12) {
theme_custom(base_size=base_size) %+replace%
theme(axis.text.x = element_text(colour="black", angle = 90, hjust = 1, vjust = 0.5, size=base_size))
}
For each dataset - PBMC and tumour - we need to load the B and T cell VDJ data that we have previously summarised at the cell barcode level and the Seurat objects from the GEX analysis.
For the T cell data we load the tab-delimited file output from joining the cellranger and IgBLAST data, but for the B cells we will use the tab-delimited file from the clone calling.
These files are available from the github repo but have also been pre-loaded into Posit.cloud:
#loading VDJ data
## loading form git hub
#b.data <- read_tsv("https://raw.githubusercontent.com/kjlj/scRNA-seq_VDJ/main/data/pbmc-tumour_Ig_cellranger_igblast_per-barcode_clns.tsv")
#t.data <- read_tsv("https://raw.githubusercontent.com/kjlj/scRNA-seq_VDJ/main/data/pbmc-tumour_TR_cellranger_igblast_per-barcode.tsv")
## loading from local/posit.cloud
b.data <- read_tsv("../data/pbmc-tumour_Ig_cellranger_igblast_per-barcode_clns.tsv")
## Rows: 6626 Columns: 264
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (118): barcode, contig_id_igh, chain_cellranger_igh, v_gene_cellranger_i...
## dbl (107): length_cellranger_igh, reads_cellranger_igh, umis_cellranger_igh,...
## lgl (39): is_cell_cellranger_igh, high_confidence_cellranger_igh, full_leng...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
t.data <- read_tsv("../data/pbmc-tumour_TR_cellranger_igblast_per-barcode.tsv")
## Rows: 7383 Columns: 264
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: "\t"
## chr (119): barcode, contig_id_tra, chain_cellranger_tra, v_gene_cellranger_t...
## dbl (106): length_cellranger_tra, reads_cellranger_tra, umis_cellranger_tra,...
## lgl (39): is_cell_cellranger_tra, high_confidence_cellranger_tra, full_leng...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
#loading seurat objects
#loading from dropbox as large file need to extend the timeout
#options(timeout=600)
#pbmc.seurat.obj <- readRDS(url("https://www.dropbox.com/scl/fi/4s610vt1mgtmgibdvfsar/pbmc_seurat-without-VDJ-genes-azimuth.rds?rlkey=ftdkxi9mnxezhbb42dqftbqel&dl=1"))
#tumour.seurat.obj <- readRDS(url("https://www.dropbox.com/scl/fi/scik47zay4x27t4wxmo70/tumour_seurat-without-VDJ-genes-azimuth.rds?rlkey=z8ghoeoboaneniv82e2xywji3&dl=1"))
#loading from local/posit.cloud
pbmc.seurat.obj <- readRDS("../seurat_objects/pbmc_seurat-without-VDJ-genes-azimuth.rds")
tumour.seurat.obj <- readRDS("../seurat_objects/tumour_seurat-without-VDJ-genes-azimuth.rds")
Up until now we have treated the B and T VDJ data within the same dataset independently but they are now going to be united with the scRNA-seq data. This requires that for each dataset, each cell barcode has only a single entry - either B or T VDJ data. There shouldn’t be cells with both B and T VDJ for the same cell barcode, but doublets, which haven’t been filtered from the VDJ, can result in this artifact.
Cell barcodes that have both B and T VDJ data need to be identified so that they can be removed:
#check for any cell barcodes that have both B and T cell data, likely to represent doublets and should be filtered
dual.bt.summary <- bind_rows(b.data %>%
select(barcode, dataset) %>%
mutate(chain = "B"),
t.data %>%
select(barcode, dataset) %>%
mutate(chain = "T")) %>%
pivot_wider(id_cols = c("barcode", "dataset"),
names_from = "chain",
values_from = "chain",
values_fn = length) %>%
replace(is.na(.), 0) %>%
mutate(dual_cell = ifelse(B == 1 & T == 1, TRUE, FALSE))
#get count for the number of cells with both B and T cell data from each dataset
with(dual.bt.summary, table(dataset, dual_cell))
## dual_cell
## dataset FALSE TRUE
## pbmc 6116 96
## tumour 7365 168
Add the dual_cell column to the B and T data and filter
the cells that are TRUE:
b.data <- b.data %>%
left_join(dual.bt.summary %>% select(-c(T, B)), by = c("barcode", "dataset")) %>%
filter(dual_cell == FALSE)
table(b.data$dataset)
##
## pbmc tumour
## 886 5476
t.data <- t.data %>%
left_join(dual.bt.summary %>% select(-c(T, B)), by = c("barcode", "dataset")) %>%
filter(dual_cell == FALSE)
table(t.data$dataset)
##
## pbmc tumour
## 5230 1889
rm(dual.bt.summary)
There may also be VDJ cell barcodes for which there is no corresponding data in the scRNA-seq due to the cell being removed during filtering. We will drop VDJ cell barcodes that don’t have data in the GEX datasets.
To remove the cells, we need a list of the cell barcodes for the PBMC and tumour samples from the Seurat objects. This can be obtained from the rownames of the meta.data slot in each of the objects:
#get the barcodes for the PBMCs
pbmc.barcodes <- tibble(barcode = rownames(pbmc.seurat.obj@meta.data)) %>%
mutate(dataset = "pbmc",
in_gex = TRUE)
#get the barcodes for the tumour
tumour.barcodes <- tibble(barcode = rownames(tumour.seurat.obj@meta.data)) %>%
mutate(dataset = "tumour",
in_gex = TRUE)
#combine, the dataset col keeps things unique
barcodes <- bind_rows(pbmc.barcodes, tumour.barcodes)
#join with the B cell data
b.data <- b.data %>%
left_join(barcodes) %>%
mutate(in_gex = ifelse(is.na(in_gex), FALSE, TRUE))
## Joining with `by = join_by(barcode, dataset)`
#join with the T cell data
t.data <- t.data %>%
left_join(barcodes) %>%
mutate(in_gex = ifelse(is.na(in_gex), FALSE, TRUE))
## Joining with `by = join_by(barcode, dataset)`
#print the number of cell barcodes that overlap with the GEX
with(t.data, table(dataset, in_gex))
## in_gex
## dataset FALSE TRUE
## pbmc 162 5068
## tumour 183 1706
with(b.data, table(dataset, in_gex))
## in_gex
## dataset FALSE TRUE
## pbmc 46 840
## tumour 571 4905
#removing data no longer needed to save memory due to size of seurat objects
rm(list = c("pbmc.barcodes", "tumour.barcodes", "barcodes"))
There are some additional data fields that could be of interest for anlaysis that we can add to the B and T cell datasets.
For the B cells, remove the cells that aren’t in the GEX and add columns for paired chains, clone size, whether or not the cell is part of an expanded clone and percent of somatic hypermutation and the isotype:
#add extra data fields to the b cell data
b.data <- b.data %>%
filter(in_gex) %>% #remove VDJ for cells that aren't in the GEX
mutate(bcr_paired = ifelse(!(is.na(v_call_igblast_igh)) & !(is.na(v_call_igblast_igkl)), TRUE, FALSE)) %>% #add a column to indicate if the b cells include paired chains
group_by(dataset, clone_id, unq_clone_id) %>% #group cells by their clone ids within each dataset in order to add columns for # cells in a clone and whether or not a clone is expanded
mutate(cln_size = length(barcode)) %>% #add the size for each clone, using mutate rather than summarise
ungroup() %>% #remove the grouping
mutate(b_expanded_cln = ifelse(cln_size == 1, "singleton", "expanded")) %>% #add a column indicating if the clone is expanded based on the cell cnt for the clone
mutate(vh_mut = ifelse(is.na(v_identity_igblast_igh), NA_integer_, 100 - v_identity_igblast_igh),
vkl_mut = ifelse(is.na(v_identity_igblast_igkl), NA_integer_, 100 - v_identity_igblast_igkl),
b_mut_class = ifelse(vh_mut > 2 | vkl_mut > 2, "mutated", "unmutated")) %>% #for the Ig data add the SHM & whether a cell's Ig is mutated (using 2% threshold)
mutate(isotype = ifelse(is.na(c_call_igblast_igh), NA_character_, str_replace(c_call_igblast_igh, "\\*.+$", ""))) #for the Ig data add the isotype subclass
#number of CELLs in expanded clones
with(b.data, table(dataset, b_expanded_cln))
## b_expanded_cln
## dataset expanded singleton
## pbmc 12 828
## tumour 182 4723
#number of cells that are mutated vs unmutated
with(b.data, table(dataset, b_mut_class))
## b_mut_class
## dataset mutated unmutated
## pbmc 169 665
## tumour 1145 3683
#number of cells for each isotype subclass
with(b.data, table(dataset, isotype))
## isotype
## dataset IGHA1 IGHA2 IGHD IGHE IGHG1 IGHG1,IGHG2 IGHG2 IGHG3 IGHG4 IGHM
## pbmc 30 7 12 0 32 0 15 5 0 731
## tumour 197 52 345 1 240 4 168 45 1 3713
For the T cells, removing the cells that don’t have the barcodes in GEX, and then adding paired clonotype label, clone size and whether or not the cell is part of a clonal expansion:
#add additional fields to the t cell data
t.data <- t.data %>%
filter(in_gex) %>% #removing cells that aren't in the GEX
mutate(tcr_paired = ifelse( !(is.na(clonotype_tra)) & !(is.na(clonotype_trb)), TRUE, FALSE)) %>% #add a column to indicate if the t cells include paired chains
mutate(paired_clonotype = case_when(is.na(clonotype_tra) & !(is.na(clonotype_trb)) ~ paste(clonotype_trb),
!(is.na(clonotype_tra)) & is.na(clonotype_trb) ~ paste(clonotype_tra),
!(is.na(clonotype_tra)) & !(is.na(clonotype_trb)) ~ paste(clonotype_trb, clonotype_tra, sep = ":"),
TRUE ~ NA_character_)) %>% #for the t cell add a clonotype label that joins both the TRA and TRB, the clone clustering for the Ig already considered the IGH and IGKL
group_by(dataset, paired_clonotype) %>% #group cells by their paired clonotype labels to add cols for # cells in each clonotype and whether clonotype is expanded
mutate(cln_size = length(barcode)) %>% #add the size for each clone, using mutate rather than summarise
ungroup() %>% #remove the grouping
mutate(t_expanded_cln = ifelse(cln_size == 1, "singleton", "expanded")) %>% #add a column indicating if the clone is expanded based on the cell cnt for the clone
mutate(isotype = ifelse(is.na(c_call_igblast_trb), NA_character_, str_replace(c_call_igblast_trb, "\\*.+$", ""))) #adding the TRB constant region type, not actually isotype but for plotting same col name
#number of CELLs in expanded clones
with(t.data, table(dataset, t_expanded_cln))
## t_expanded_cln
## dataset expanded singleton
## pbmc 125 4943
## tumour 45 1661
with(t.data, table(dataset, isotype))
## isotype
## dataset TRBC1 TRBC2
## pbmc 2197 2787
## tumour 602 1050
Ranks for the expanded clones/clonotypes can also be added (where #1 is the largest clone/clonotype in the dataset):
## adding a rank for the clones/clonotypes
b.cln.rnks <- b.data %>%
filter(!(is.na(unq_clone_id))) %>%
group_by(dataset, unq_clone_id) %>%
summarise(cln_size = length(barcode)) %>%
ungroup() %>%
mutate(expanded = ifelse(cln_size > 1, TRUE, FALSE)) %>%
filter(expanded) %>%
group_by(dataset) %>%
mutate(b_cln_rnk = min_rank(desc(cln_size))) %>%
ungroup() %>%
arrange(b_cln_rnk)
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.data <- b.data %>%
left_join(b.cln.rnks %>% select(dataset, unq_clone_id, b_cln_rnk),
by = c("unq_clone_id", "dataset"))
rm(b.cln.rnks)
t.cln.rnks <- t.data %>%
filter(!(is.na(paired_clonotype))) %>%
group_by(dataset, paired_clonotype) %>%
summarise(cln_size = length(barcode)) %>%
ungroup() %>%
mutate(expanded = ifelse(cln_size > 1, TRUE, FALSE)) %>%
filter(expanded) %>%
group_by(dataset) %>%
mutate(t_cln_rnk = min_rank(desc(cln_size))) %>%
ungroup() %>%
arrange(t_cln_rnk)
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.data <- t.data %>%
left_join(t.cln.rnks %>% select(dataset, paired_clonotype, t_cln_rnk),
by = c("paired_clonotype", "dataset"))
rm(t.cln.rnks)
The B and T cell data can now be formatted to add to the Seurat objects for the scRNA-seq analysis. The join will happen via the cell barcode and here we are selecting the subset of columns that want to add into the Seurat meta.data. This list can be amended as needed:
## this can be altered
b.metadata <- b.data %>%
select(barcode, dataset, v_igh, j_igh, v_igkl, j_igkl, unq_clone_id, bcr_paired, b_cln_rnk, b_expanded_cln,
vh_mut, vkl_mut, b_mut_class, isotype)
t.metadata <- t.data %>%
select(barcode, dataset, v_tra, j_tra, v_trb, j_trb, paired_clonotype, tcr_paired, t_cln_rnk, t_expanded_cln, isotype)
To add the metadata need to convert from a tibble to a
data.frame and move the barcode to the row names. We also
need to split the PBMC and tumour to separate data frames:
##PBMC
#create a data.frame from combining the B and T cell data for the PBMC dataset
#adding an extra column after bring the data together to indicate whether a cell has B VDJ, T VDJ or neither
pbmc.metadata <- as.data.frame(b.metadata %>% filter(dataset == "pbmc") %>%
full_join(t.metadata %>% filter(dataset == "pbmc")) %>%
mutate(vdj_cell_type = case_when(!(is.na(paired_clonotype)) ~ "T",
!(is.na(unq_clone_id)) ~ "B",
TRUE ~ NA_character_)))
## Joining with `by = join_by(barcode, dataset, isotype)`
#add the barcodes as the row names for the data.frame
rownames(pbmc.metadata) <- pbmc.metadata$barcode
#remove the barcode column
pbmc.metadata <- pbmc.metadata %>% select(-barcode)
Adding PBMC metadata:
#add the metadata to the seurat object, as the rownames are the cell barcodes this will ensure everything gets matched up!
pbmc.seurat.obj <- AddMetaData(pbmc.seurat.obj, pbmc.metadata)
#check the meta data cols now in the seurat object
colnames(pbmc.seurat.obj@meta.data)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "nCount_SCT"
## [5] "nFeature_SCT" "SCT_snn_res.0.8" "seurat_clusters" "SCT_snn_res.0.7"
## [9] "SCT_snn_res.0.6" "SCT_snn_res.0.5" "SCT_snn_res.0.4" "SCT_snn_res.0.3"
## [13] "SCT_snn_res.0.2" "azimuth_l1" "azimuth_l2" "azimuth_l3"
## [17] "dataset" "v_igh" "j_igh" "v_igkl"
## [21] "j_igkl" "unq_clone_id" "bcr_paired" "b_cln_rnk"
## [25] "b_expanded_cln" "vh_mut" "vkl_mut" "b_mut_class"
## [29] "isotype" "v_tra" "j_tra" "v_trb"
## [33] "j_trb" "paired_clonotype" "tcr_paired" "t_cln_rnk"
## [37] "t_expanded_cln" "vdj_cell_type"
#can remove the data frame once it is in the seurat object
rm(list = c("pbmc.metadata"))
Repeating for the tumour dataset:
##tumour
#create a data.frame from B and T data for the tumour dataset
tumour.metadata <- as.data.frame(b.metadata %>% filter(dataset == "tumour") %>%
full_join(t.metadata %>% filter(dataset == "tumour")) %>%
mutate(vdj_cell_type = case_when(!(is.na(paired_clonotype)) ~ "T",
!(is.na(unq_clone_id)) ~ "B",
TRUE ~ NA_character_)))
## Joining with `by = join_by(barcode, dataset, isotype)`
#add the barcodes as the row names for the data.frame
rownames(tumour.metadata) <- tumour.metadata$barcode
#remove the barcode column
tumour.metadata <- tumour.metadata %>% select(-barcode)
#add the metadata to the seurat object, as the rownames are the cell barcodes this will ensure everything gets matched up!
tumour.seurat.obj <- AddMetaData(tumour.seurat.obj, tumour.metadata)
#check the meta data cols now in the seurat object
colnames(tumour.seurat.obj@meta.data)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA" "nCount_SCT"
## [5] "nFeature_SCT" "SCT_snn_res.0.8" "seurat_clusters" "SCT_snn_res.0.7"
## [9] "SCT_snn_res.0.6" "SCT_snn_res.0.5" "SCT_snn_res.0.4" "SCT_snn_res.0.3"
## [13] "SCT_snn_res.0.2" "azimuth_l1" "azimuth_l2" "azimuth_l3"
## [17] "dataset" "v_igh" "j_igh" "v_igkl"
## [21] "j_igkl" "unq_clone_id" "bcr_paired" "b_cln_rnk"
## [25] "b_expanded_cln" "vh_mut" "vkl_mut" "b_mut_class"
## [29] "isotype" "v_tra" "j_tra" "v_trb"
## [33] "j_trb" "paired_clonotype" "tcr_paired" "t_cln_rnk"
## [37] "t_expanded_cln" "vdj_cell_type"
#remove objects that we no longer require
rm(list = c("tumour.metadata", "b.metadata", "t.metadata"))
Now that the VDJ features are in the meta.data slot for seurat objects they can be used for plotting.
For example, highlighting which cells have B or T VDJs on a UMAP:
#plot the metadata onto the umap within Seurat
p1 <- DimPlot(pbmc.seurat.obj, group.by = "vdj_cell_type", cols = pal_d3()(2), na.value = "lightgray")
plot(p1)
Or, plotting which cells are using different isotype subclasses or TRBC genes:
p1 <- DimPlot(pbmc.seurat.obj, group.by = "isotype", cols = pal_d3("category20")(10), na.value = "lightgray")
plot(p1)
Plotting which clones are expanded versus singleton:
p1 <- DimPlot(subset(pbmc.seurat.obj, vdj_cell_type == "B"),
group.by = "b_expanded_cln",
cols = pal_d3("category20")(10),
pt.size = 0.8,
na.value = "lightgray")
## Warning: Removing 3466 cells missing data for vars requested
plot(p1)
Plotting B cell clones based on their clone rank:
p1 <- DimPlot(subset(pbmc.seurat.obj, vdj_cell_type == "B"),
group.by = c("b_cln_rnk"),
cols = pal_d3("category20")(10),
pt.size = 0.8,
na.value = "lightgray")
## Warning: Removing 3466 cells missing data for vars requested
plot(p1)
Some examples for the tumour dataset:
#tumour
p1 <- DimPlot(tumour.seurat.obj, group.by = "vdj_cell_type", cols = pal_d3()(2), na.value = "lightgray")
plot(p1)
p1 <- DimPlot(tumour.seurat.obj, group.by = "isotype", cols = pal_d3("category20")(20), na.value = "lightgray")
plot(p1)
The use of the VDJ meta data is not limited to UMAPS. It can also be used for other plots types.
For example, we can look at gene expression of CD3 for each seurat cluster for T cell clonotypes that are either expanded or singleton:
#can use the VDJ data when exploring GEX too
p1 <- VlnPlot(tumour.seurat.obj, group.by = "seurat_clusters", split.by = "t_expanded_cln", features = c("CD3E"))
## The default behaviour of split.by has changed.
## Separate violin plots are now plotted side-by-side.
## To restore the old behaviour of a single split violin,
## set split.plot = TRUE.
##
## This message will be shown once per session.
## Warning: Removing 5795 cells missing data for vars requested
plot(p1)
Or, showing relative gene expression per cell for CD3 for on a feature plot:
p1 <- FeaturePlot(tumour.seurat.obj, split.by = "t_expanded_cln", features = c("CD3E"))
plot(p1)
## Warning: Removed 5795 rows containing missing values (`geom_point()`).
## Removed 5795 rows containing missing values (`geom_point()`).
#remove the plot object and run garbage collection
rm(p1)
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4427651 236.5 7495724 400.4 NA 7495724 400.4
## Vcells 286010982 2182.1 415452225 3169.7 102400 325964872 2487.0
Plotting within Seurat can lack some flexibility, so in addition to
importing data to the Seurat object, we can also export data from Seurat
objects for use with tidyverse (or base R) functions. In this example,
we will export data from the meta.data and the UMAP coordinates slots,
but you can also export the normalised gene expression data using the
GetAssayData() function.
For the PBMC sample:
## PBMCs
#metadata slot
pbmc.seurat.meta <- as.data.frame(pbmc.seurat.obj@meta.data) %>%
rownames_to_column("barcode")
#UMAP coords
pbmc.seurat.umap <- as.data.frame(pbmc.seurat.obj@reductions$umap@cell.embeddings) %>%
rownames_to_column("barcode")
#combining
pbmc.seurat.data <- pbmc.seurat.meta %>%
left_join(pbmc.seurat.umap, by = "barcode") %>%
as_tibble()
pbmc.seurat.data %>% head()
## # A tibble: 6 × 41
## barcode orig.ident nCount_RNA nFeature_RNA nCount_SCT nFeature_SCT
## <chr> <fct> <dbl> <int> <dbl> <int>
## 1 AAACCTGAGACAGACC-1 SeuratProj… 4751 1982 5264 1959
## 2 AAACCTGAGCGATAGC-1 SeuratProj… 5750 1759 5735 1745
## 3 AAACCTGAGCGGCTTC-1 SeuratProj… 4991 1957 5384 1939
## 4 AAACCTGAGGATCGCA-1 SeuratProj… 6011 2309 5898 2290
## 5 AAACCTGAGTCACGCC-1 SeuratProj… 7249 2312 6248 2295
## 6 AAACCTGCACGTCAGC-1 SeuratProj… 5660 1946 5687 1932
## # ℹ 35 more variables: SCT_snn_res.0.8 <fct>, seurat_clusters <fct>,
## # SCT_snn_res.0.7 <fct>, SCT_snn_res.0.6 <fct>, SCT_snn_res.0.5 <fct>,
## # SCT_snn_res.0.4 <fct>, SCT_snn_res.0.3 <fct>, SCT_snn_res.0.2 <fct>,
## # azimuth_l1 <chr>, azimuth_l2 <chr>, azimuth_l3 <chr>, dataset <chr>,
## # v_igh <chr>, j_igh <chr>, v_igkl <chr>, j_igkl <chr>, unq_clone_id <chr>,
## # bcr_paired <lgl>, b_cln_rnk <int>, b_expanded_cln <chr>, vh_mut <dbl>,
## # vkl_mut <dbl>, b_mut_class <chr>, isotype <chr>, v_tra <chr>, …
rm(list = c("pbmc.seurat.meta", "pbmc.seurat.umap"))
For the tumour sample:
## Tumour
tumour.seurat.meta <- as.data.frame(tumour.seurat.obj@meta.data) %>%
rownames_to_column("barcode")
tumour.seurat.umap <- as.data.frame(tumour.seurat.obj@reductions$umap@cell.embeddings) %>%
rownames_to_column("barcode")
tumour.seurat.data <- tumour.seurat.meta %>%
left_join(tumour.seurat.umap, by = "barcode") %>%
as_tibble()
tumour.seurat.data %>% head()
## # A tibble: 6 × 41
## barcode orig.ident nCount_RNA nFeature_RNA nCount_SCT nFeature_SCT
## <chr> <fct> <dbl> <int> <dbl> <int>
## 1 AAACCTGAGATCCTGT-1 SeuratProj… 5121 1648 6232 1631
## 2 AAACCTGAGCGATAGC-1 SeuratProj… 7212 1918 6915 1906
## 3 AAACCTGAGGAGTCTG-1 SeuratProj… 5903 1945 6401 1940
## 4 AAACCTGCAGCGTAAG-1 SeuratProj… 7226 2348 6927 2328
## 5 AAACCTGCAGTATAAG-1 SeuratProj… 14042 3363 7489 2861
## 6 AAACCTGCATCTGGTA-1 SeuratProj… 23428 5020 6760 2422
## # ℹ 35 more variables: SCT_snn_res.0.8 <fct>, seurat_clusters <fct>,
## # SCT_snn_res.0.7 <fct>, SCT_snn_res.0.6 <fct>, SCT_snn_res.0.5 <fct>,
## # SCT_snn_res.0.4 <fct>, SCT_snn_res.0.3 <fct>, SCT_snn_res.0.2 <fct>,
## # azimuth_l1 <chr>, azimuth_l2 <chr>, azimuth_l3 <chr>, dataset <chr>,
## # v_igh <chr>, j_igh <chr>, v_igkl <chr>, j_igkl <chr>, unq_clone_id <chr>,
## # bcr_paired <lgl>, b_cln_rnk <int>, b_expanded_cln <chr>, vh_mut <dbl>,
## # vkl_mut <dbl>, b_mut_class <chr>, isotype <chr>, v_tra <chr>, …
#remove the seurat objects now that the necessary data has been dumped
rm(list = c("tumour.seurat.meta", "tumour.seurat.umap",
"tumour.seurat.obj", "pbmc.seurat.obj"))
#force garbage collection
gc()
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 4404008 235.2 7495724 400.4 NA 7495724 400.4
## Vcells 17012281 129.8 332361780 2535.8 102400 325964872 2487.0
We now have greater control over the plotting and the format of the plots and as a bounus we have freed up a lot of memory by unloading the Seurat objects.
An example of plotting SHM for each B cell on a UMAP plot:
#an example of plotting B cell SHM levels on the UMAP
pbmc.shm.umap.p1 <- ggplot(pbmc.seurat.data, aes(x = umap_1, y = umap_2)) +
geom_point(data = pbmc.seurat.data %>% filter(vdj_cell_type != "B"),
size = 0.5,
colour = "lightgray") + #plotting cells that are NOT classed as B cell from VDJ on the bottom layer of the plot with a smaller point size
geom_point(data = pbmc.seurat.data %>% filter(vdj_cell_type == "B"),
size = 1,
aes(colour = vh_mut)) + #plotting cells that are classed as B cells on top layer and colouring by IGH SHM level
scale_color_viridis_c(name = "VH SHM %") + #using the continous viridis for the colour
theme_custom() +
labs(x = "UMAP_1",
y = "UMAP_2",
title = "PBMC IGHV SHM",
caption = "non B cells in gray") #labels for plot
plot(pbmc.shm.umap.p1)
tumour.shm.umap.p1 <- ggplot(tumour.seurat.data, aes(x = umap_1, y = umap_2)) +
geom_point(data = tumour.seurat.data %>% filter(vdj_cell_type != "B"),
size = 0.5,
colour = "lightgray") + #plotting cells that are NOT classed as B cell from VDJ on the bottom layer of the plot with a smaller point size
geom_point(data = tumour.seurat.data %>% filter(vdj_cell_type == "B"),
size = 1,
aes(colour = vh_mut)) + #plotting cells that are classed as B cells on top layer and colouring by IGH SHM level
scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
theme_custom() +
labs(x = "UMAP_1",
y = "UMAP_2",
title = "tumour IGHV SHM",
caption = "non B cells in gray") #labels for plot
plot(tumour.shm.umap.p1)
Splitting plots across sub-panels is simple using the
facet_wrap() or facet_grid() from
ggplot2:
#only plotting the B cells, but splitting by isotype
pbmc.shm.umap.p2 <- ggplot(pbmc.seurat.data %>% filter(vdj_cell_type == "B"), aes(x = umap_1, y = umap_2)) +
geom_point(size = 1,
aes(colour = vh_mut)) +
scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
theme_custom() +
labs(x = "UMAP_1",
y = "UMAP_2",
title = "PBMC IGHV SHM",
caption = "only showing cells with Ig VDJ") + #labels for plot
scale_y_continuous(limits = c(-20, 15)) +
scale_x_continuous(limits = c(-15, 10)) +
facet_wrap(~ isotype, nrow = 2, scales = "free") #splitting by the isotype subclass
#plot(pbmc.shm.umap.p2)
tumour.shm.umap.p2 <- ggplot(tumour.seurat.data %>% filter(vdj_cell_type == "B"), aes(x = umap_1, y = umap_2)) +
geom_point(size = 1,
aes(colour = vh_mut)) +
scale_color_viridis_c(name = "VH SHM %") + #using the continuous viridis for the colour
theme_custom() +
labs(x = "UMAP_1",
y = "UMAP_2",
title = "tumour IGHV SHM",
caption = "only showing cells with Ig VDJ") + #labels for plot
scale_y_continuous(limits = c(-20, 15)) +
scale_x_continuous(limits = c(-15, 25)) +
facet_wrap(~ isotype, nrow = 2, scales = "free") #splitting by the isotype subclass
#plot(tumour.shm.umap.p2)
plot(ggarrange(pbmc.shm.umap.p2, tumour.shm.umap.p2, nrow = 2))
We can build more complex panels using ggplot2 and
ggpubr. For example, visualising the SHM of the B cells on
UMAPs and displaying with a quantification of the SHM% for clones for
each isotype with statistics.
Start with a summary of the SHM for each cell for each isotype in the two datasets:
## isotype order by chromosome location
isotype_order <- c("IGHM", "IGHD", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHG4", "IGHE", "IGHA2")
##combining the datasets and adding the isotype levels
seurat.data <- bind_rows(pbmc.seurat.data %>% mutate(dataset = "PBMC"),
tumour.seurat.data %>% mutate(dataset = "tumour")) %>%
mutate(isotype_fac = factor(isotype, isotype_order))
#SHM summary
shm.summary.p1 <- ggplot(seurat.data %>% filter(vdj_cell_type == "B"),
aes(x = isotype_fac, y = vh_mut)) +
geom_boxplot(aes(group = interaction(dataset, isotype_fac)),
position = position_dodge(width = 0.9)) +
geom_point(aes(colour = dataset, group = dataset),
position = position_dodge(width = 0.9)) +
theme_x_rotated() +
labs(x = "isotype subclass",
y = "VH SHM %") +
scale_colour_aaas() +
guides(colour = guide_legend(override.aes = list(size = 5)))
plot(shm.summary.p1)
Adding p-values to determine if there is any difference in the mean
SHM between the PBMC and tumour B cells for the different isotypes. This
can be done with the rstatix package which offers a variety
of different tests and then formats the output for plotting on
ggplot2 with ggpubr functions. This call is a
little more complex because of how the data is grouped:
#adding stats for difference between isotype SHM means
shm.summary.stats <- seurat.data %>%
filter(vdj_cell_type == "B") %>% #only using data from B cells by VDJ
filter(isotype_fac %in% c("IGHM", "IGHD", "IGHG3", "IGHG1", "IGHA1", "IGHG2", "IGHA2")) %>% #not all isotype have necessary data points, so restrict
group_by(isotype_fac) %>% #want to perform the tests for each isotype
wilcox_test(vh_mut ~ dataset) %>% #do th wilcoxon test for difference in means between the datasets
ungroup() %>% #remove the grouping by isotype
adjust_pvalue(method = "bonferroni") %>% # do the p-value adjustment
add_xy_position(x = "isotype_fac", dodge = 0.9) %>% #add the X & y-positions that are needed for plotting
mutate(group1 = isotype_fac,
group2 = isotype_fac) #adjust groups to get values in correct position on panel
#print
shm.summary.stats %>% head()
## # A tibble: 6 × 14
## isotype_fac .y. group1 group2 n1 n2 statistic p p.adj
## <fct> <chr> <fct> <fct> <int> <int> <dbl> <dbl> <dbl>
## 1 IGHM vh_mut IGHM IGHM 731 3713 1355168. 0.927 1
## 2 IGHD vh_mut IGHD IGHD 12 345 2072. 0.99 1
## 3 IGHG3 vh_mut IGHG3 IGHG3 5 45 133 0.518 1
## 4 IGHG1 vh_mut IGHG1 IGHG1 32 240 4992. 0.00585 0.0410
## 5 IGHA1 vh_mut IGHA1 IGHA1 30 197 3514 0.0956 0.669
## 6 IGHG2 vh_mut IGHG2 IGHG2 15 168 1394. 0.495 1
## # ℹ 5 more variables: y.position <dbl>, groups <named list>, x <dbl>,
## # xmin <dbl>, xmax <dbl>
Can now add the p-values to the panel using the
stat_pvalue_manual() function by specifying the tibble that
contains out test results (have adjusted to only display p-values that
are significant):
#add these to the panel using ggpubr function stat_pvalue_manual
shm.summary.p2 <- ggplot(seurat.data %>% filter(vdj_cell_type == "B"),
aes(x = isotype_fac, y = vh_mut)) +
geom_boxplot(aes(group = interaction(dataset, isotype_fac)),
position = position_dodge(width = 0.9)) +
geom_point(aes(colour = dataset, group = dataset),
position = position_dodge(width = 0.9)) +
stat_pvalue_manual(data = shm.summary.stats, label.size = 3,
hide.ns = TRUE) +
theme_x_rotated() +
labs(x = "isotype subclass",
y = "VH SHM %",
caption = "p-values: Wilcoxon, bonf adjusted") +
scale_colour_aaas() +
guides(colour = guide_legend(override.aes = list(size = 5)))
plot(shm.summary.p2)
Finally, can use ggarrange() from ggpubr to
combine the panels into a single figure panel:
plot(ggarrange(ggarrange(pbmc.shm.umap.p1, tumour.shm.umap.p1, nrow = 1),
shm.summary.p2, nrow = 2, ncol = 1))
Exploring the extent of clone/clonotype expansion in the different datasets by building up a figure panel. Starting with a UMAP showing the expanded and singleton clones for the samples:
exp.umap.p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2)) +
geom_point(data = seurat.data %>% filter(is.na(vdj_cell_type)),
size = 0.5, colour = "lightgray") + #bottom layer are the cell that don't have any VDJ
geom_point(data = seurat.data %>% filter(vdj_cell_type == "B", b_expanded_cln != "expanded"),
size = 0.8,
aes(colour = b_expanded_cln)) + #B cells, singleton
geom_point(data = seurat.data %>% filter(vdj_cell_type == "B", b_expanded_cln == "expanded"),
size = 0.8,
aes(colour = b_expanded_cln)) + #B cells, expanded
geom_point(data = seurat.data %>% filter(vdj_cell_type == "T", t_expanded_cln != "expanded"),
aes(colour = t_expanded_cln),
size = 0.8) + #T cells, singleton
geom_point(data = seurat.data %>% filter(vdj_cell_type == "T", t_expanded_cln == "expanded"),
aes(colour = t_expanded_cln),
size = 0.8) + #T cells, expanded
theme_custom() +
facet_grid(dataset ~ vdj_cell_type) +
scale_colour_npg(name = "expansions") +
guides(colour = guide_legend(override.aes = list(size = 5)))
#exp.umap.p1
Next make stacked barplot to summarise the proportion of clones that are expanded for B cells in each sample, along with the % of cells that are in expansions versus singleton clones:
##summarise
b.expanded.clns.summary <- b.data %>%
group_by(dataset, b_expanded_cln) %>%
summarise(cell_cnt = length(barcode),
cln_cnt = length(unique(unq_clone_id))) %>%
ungroup() %>%
group_by(dataset) %>%
mutate(cell_prop = cell_cnt / sum(cell_cnt),
cln_prop = cln_cnt / sum(cln_cnt)) %>%
ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.expanded.clns.summary %>% head()
## # A tibble: 4 × 6
## dataset b_expanded_cln cell_cnt cln_cnt cell_prop cln_prop
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 pbmc expanded 12 3 0.0143 0.00361
## 2 pbmc singleton 828 828 0.986 0.996
## 3 tumour expanded 182 26 0.0371 0.00547
## 4 tumour singleton 4723 4723 0.963 0.995
b.exp.p1 <- ggplot(b.expanded.clns.summary, aes(x = dataset, y = cln_prop * 100)) +
geom_bar(stat = "identity", position = "stack",
aes(fill = b_expanded_cln)) +
scale_fill_npg(name = "expanded clone") +
theme_x_rotated() +
theme(legend.position = "bottom") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(y = "% clones")
#plot(b.exp.p1)
b.exp.p2 <- ggplot(b.expanded.clns.summary, aes(x = dataset, y = cell_prop * 100)) +
geom_bar(stat = "identity", position = "stack",
aes(fill = b_expanded_cln)) +
scale_fill_npg(name = "expanded clone") +
theme_x_rotated() +
theme(legend.position = "bottom") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(y = "% cells")
#plot(b.exp.p2)
plot(ggarrange(b.exp.p1, b.exp.p2, nrow = 1))
Summarising for the T cell clonotypes:
t.expanded.clns.summary <- t.data %>%
group_by(dataset, t_expanded_cln) %>%
summarise(cell_cnt = length(barcode),
cln_cnt = length(unique(paired_clonotype))) %>%
ungroup() %>%
group_by(dataset) %>%
mutate(cell_prop = cell_cnt / sum(cell_cnt),
cln_prop = cln_cnt / sum(cln_cnt)) %>%
ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.expanded.clns.summary %>% head()
## # A tibble: 4 × 6
## dataset t_expanded_cln cell_cnt cln_cnt cell_prop cln_prop
## <chr> <chr> <int> <int> <dbl> <dbl>
## 1 pbmc expanded 125 46 0.0247 0.00922
## 2 pbmc singleton 4943 4943 0.975 0.991
## 3 tumour expanded 45 18 0.0264 0.0107
## 4 tumour singleton 1661 1661 0.974 0.989
t.exp.p1 <- ggplot(t.expanded.clns.summary, aes(x = dataset, y = cln_prop * 100)) +
geom_bar(stat = "identity", position = "stack",
aes(fill = t_expanded_cln)) +
scale_fill_npg(name = "expanded clone") +
theme_x_rotated() +
theme(legend.position = "bottom") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(y = "% clonotypes")
#plot(t.exp.p1)
t.exp.p2 <- ggplot(t.expanded.clns.summary, aes(x = dataset, y = cell_prop * 100)) +
geom_bar(stat = "identity", position = "stack",
aes(fill = t_expanded_cln)) +
scale_fill_npg(name = "expanded clone") +
theme_x_rotated() +
theme(legend.position = "bottom") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
labs(y = "% cells")
#plot(t.exp.p2)
plot(ggarrange(t.exp.p1, t.exp.p2, nrow = 1))
We have the UMAP and the stacked bar plot to show the proportions of expanded and singletons, but within the expansions we want to get a visual on how many cells there are in each clone or clonotype. To do this we will make a stacked bar chart that shows the number of cells for each clone/clonotype.
Starting with the B cell expanded clones:
##stacked bar charts for the expanded clones
b.cln.size.summary <- seurat.data %>%
filter(vdj_cell_type == "B") %>%
mutate(label = ifelse(b_expanded_cln == "expanded", paste0(unq_clone_id), "singletons")) %>%
group_by(dataset, label) %>%
summarise(cell_cnt = length(barcode)) %>%
ungroup() %>%
group_by(dataset) %>%
mutate(cell_prop = cell_cnt / sum(cell_cnt)) %>%
ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
b.cln.size.summary %>% head()
## # A tibble: 6 × 4
## dataset label cell_cnt cell_prop
## <chr> <chr> <int> <dbl>
## 1 PBMC pbmc_724 2 0.00240
## 2 PBMC pbmc_952 2 0.00240
## 3 PBMC singletons 828 0.995
## 4 tumour singletons 4723 0.988
## 5 tumour tumour_1032 2 0.000418
## 6 tumour tumour_1039 2 0.000418
b.cln.sizes.p1 <- ggplot(b.cln.size.summary %>% filter(label != "singletons"),
aes(x = dataset, y = cell_cnt, group = cell_cnt)) +
geom_bar(stat = "identity",
position = position_stack(),
aes(fill = factor(label)),
linewidth = 0.05, colour = "gray") +
scale_fill_manual(values = colorRampPalette(pal_d3("category20")(20))(40),
name = "clone") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
theme_x_rotated() +
theme(legend.position = "none") +
labs(y = "# cells")
plot(b.cln.sizes.p1)
For the T cell expanded clonotypes:
##stacked bar charts for the expanded clonotypes
t.cln.size.summary <- seurat.data %>%
filter(vdj_cell_type == "T") %>%
mutate(label = ifelse(t_expanded_cln == "expanded", paste0(paired_clonotype), "singletons")) %>%
group_by(dataset, label) %>%
summarise(cell_cnt = length(barcode)) %>%
ungroup() %>%
group_by(dataset) %>%
mutate(cell_prop = cell_cnt / sum(cell_cnt)) %>%
ungroup()
## `summarise()` has grouped output by 'dataset'. You can override using the
## `.groups` argument.
t.cln.size.summary %>% head()
## # A tibble: 6 × 4
## dataset label cell_cnt cell_prop
## <chr> <chr> <int> <dbl>
## 1 PBMC TRAV1-2_TRAJ33_AVMDSNYQLI 5 0.000987
## 2 PBMC TRBV12-4_TRBJ2-1_ASSLLFGLAGVYNEQF:TRAV9-2_TRAJ28_A… 2 0.000395
## 3 PBMC TRBV12-4_TRBJ2-3_ASSLALARSTDTQY 2 0.000395
## 4 PBMC TRBV12-4_TRBJ2-6_ASSPPRGSSGANVLT:TRAV21_TRAJ56_AVF… 2 0.000395
## 5 PBMC TRBV18_TRBJ2-1_ASSPPGANEQF:TRAV20_TRAJ4_AVPSFSGGYN… 3 0.000592
## 6 PBMC TRBV19_TRBJ1-5_ASSIKGGHQPQH:TRAV24_TRAJ45_AFIPGGGA… 3 0.000592
t.cln.sizes.p1 <- ggplot(t.cln.size.summary %>% filter(label != "singletons"),
aes(x = dataset, y = cell_cnt, group = cell_cnt)) +
geom_bar(stat = "identity",
position = position_stack(),
aes(fill = factor(label)),
linewidth = 0.05, colour = "gray") +
scale_fill_manual(values = colorRampPalette(pal_d3("category20")(20))(70),
name = "clone") +
scale_y_continuous(expand = expansion(mult = c(0, 0.05))) +
theme_x_rotated() +
theme(legend.position = "none") +
labs(y = "# cells")
plot(t.cln.sizes.p1)
Finally, build it all into a single panel to summarise the expansions across the two datasets:
## turning off the legends as will be provided by the UMAP but adding a title to each for cell type
b.exp.panel <- ggarrange(b.exp.p1, b.exp.p2, common.legend = TRUE, legend = "none")
b.exp.panel <- annotate_figure(b.exp.panel, top = text_grob("B cells", color = "black", face = "bold", size = 14))
t.exp.panel <- ggarrange(t.exp.p1, t.exp.p2, common.legend = TRUE, legend = "none")
t.exp.panel <- annotate_figure(t.exp.panel, top = text_grob("T cells", color = "black", face = "bold", size = 14))
b.cln.sizes.p1 <- annotate_figure(b.cln.sizes.p1, top = text_grob("B expanded", color = "black", face = "bold", size = 14))
t.cln.sizes.p1 <- annotate_figure(t.cln.sizes.p1, top = text_grob("T expanded", color = "black", face = "bold", size = 14))
#combining the bar plots and the UMAP with the stacked bar charts for clonotype distribution
plot(ggarrange(exp.umap.p1 + theme(legend.position = "top"),
ggarrange(b.exp.panel, t.exp.panel, nrow = 2),
ggarrange(b.cln.sizes.p1, t.cln.sizes.p1, nrow = 2),
nrow = 1, widths = c(4, 2, 1)))
One challenge when using data point fill/colour to encode a VDJ feature like SHM is that it becomes hard to track the clusters on the UMAP. Hull plots can be used to automate the generation of outlines for the clusters. To use hull plots need to load the ggforce package:
if (sum(grepl("ggforce", rownames(installed.packages()))) > 0) {
print("ggforce is already installed")
} else {
install.packages("ggforce")
install.packages("concaveman")
}
## [1] "ggforce is already installed"
library(ggforce)
For the hull plots to make reasonable representation of the clusters it is sometime necessary to exclude cells that are outliers from the main density of points for the cluster. This will vary between datasets, so may need to experiment with different values for outlier exclusions. Here using the median UMAP1 + UMAP2 coordinates for clusters and a +/- of 2.2:
clst.outliers <- seurat.data %>%
group_by(dataset, seurat_clusters) %>%
mutate(median_umap_1 = median(umap_1),
median_umap_2 = median(umap_2)) %>%
ungroup() %>%
mutate(cluster_outlier = case_when((umap_1 < (median_umap_1 - 2.2)) | (umap_1 > (median_umap_1 + 2.2)) ~ TRUE,
(umap_2 < (median_umap_2 - 2.2)) | (`umap_2` > (median_umap_2 + 2.2)) ~ TRUE,
TRUE ~ FALSE))
Here is the UMAP with the data points coloured by Seurat cluster:
umap.clst.p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2, color = factor(seurat_clusters))) +
geom_point(size = 0.2) +
scale_color_d3(palette = "category20",
name = "") +
theme_custom() +
facet_wrap(~ dataset) +
guides(colour = guide_legend(override.aes = list(size = 5)))
umap.clst.p1
The result of adding cluster outlines using the hull plots:
##points + hull plot for cluster regions
umap.clst.p2 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2, color = factor(seurat_clusters))) +
geom_point(size = 0.4) +
geom_mark_hull(data = clst.outliers %>% filter(!cluster_outlier),
aes(fill = factor(seurat_clusters)), concavity = 3, expand = 0.01, radius = 0.01) +
scale_color_d3(palette = "category20",
name = "") +
scale_fill_d3(palette = "category20",
name = "") +
theme_custom() +
facet_wrap(~ dataset) +
guides(colour = guide_legend(override.aes = list(size = 5)))
#umap.clst.p2
plot(ggarrange(umap.clst.p1, umap.clst.p2, nrow = 2))
Keeping the hull outline, but showing expanded clones using the data point colouring:
p1 <- ggplot(seurat.data, aes(x = umap_1, y = umap_2)) +
geom_point(size = 0.8, shape = 21,
aes(fill = vh_mut),
stroke = NA) +
geom_mark_hull(data = clst.outliers %>% filter(!cluster_outlier),
aes(colour = factor(seurat_clusters)),
concavity = 3, expand = 0.01, radius = 0.01,
linetype = "dashed", linewidth = 1) +
scale_fill_viridis_c(name = "VH SHM%") +
scale_colour_d3(palette = "category20",
name = "Cluster") +
theme_custom() +
facet_wrap(~ dataset) +
guides(colour = guide_legend(override.aes = list(size = 5)))
plot(p1)
As the B and T cell data have been filtered and updated save new versions:
#save the tibble that include the seurat with VDJ
write_tsv(seurat.data, "../data/pbmc-tumour_seruat_data.tsv")
##save the b and t cell data that has the additional columns and filtering
write_tsv(b.data, "../data/pbmc-tumour_b_cells_vdj.tsv")
write_tsv(t.data, "../data/pbmc-tumour_t_cells_vdj.tsv")
Cell Ranger
VDJIf you prefer not to post-process data and to use the
Cell Ranger outputs directly here is how to add this
meta.data to a Suerat object.
The Seurat objects have been unloaded, so need to reload:
pbmc.seurat.obj <- readRDS("../seurat_objects/pbmc_seurat-without-VDJ-genes-azimuth.rds")
Grab the filtered_contig_annotations.csv file from
Cell Ranger:
pbmc.b.cellranger <- read_csv("https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_b_filtered_contig_annotations.csv")
## Rows: 2066 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): barcode, contig_id, chain, v_gene, d_gene, j_gene, c_gene, cdr3, c...
## dbl (3): length, reads, umis
## lgl (4): is_cell, high_confidence, full_length, productive
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
pbmc.t.cellranger <- read_csv("https://cf.10xgenomics.com/samples/cell-vdj/5.0.0/sc5p_v2_hs_PBMC_10k/sc5p_v2_hs_PBMC_10k_t_filtered_contig_annotations.csv")
## Rows: 10817 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (11): barcode, contig_id, chain, v_gene, d_gene, j_gene, c_gene, cdr3, c...
## dbl (3): length, reads, umis
## lgl (4): is_cell, high_confidence, full_length, productive
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
The Cell Ranger data is per contig, need to change to
per cell barcode before adding to Seurat object. To do this, going to
filter T and B for contigs to the top chains by UMI for IGH + IGK/L and
TRA + TRB, respectively:
pbmc.b.cellranger.reformat <- pbmc.b.cellranger %>%
filter(productive == TRUE) %>%
mutate(chain_grp = ifelse(chain == "IGH", "igh", "igkl")) %>%
group_by(barcode, chain_grp) %>%
mutate(chain_rnk = row_number(desc(umis))) %>%
ungroup() %>%
filter(chain_rnk == 1) %>%
pivot_wider(id_cols = c("barcode", "is_cell"),
names_from = "chain_grp",
values_from = -c("barcode", "is_cell", "chain_grp")) %>%
mutate(is_paired_ig = ifelse(!(is.na(v_gene_igh)) & !(is.na(v_gene_igkl)), TRUE, FALSE))
pbmc.t.cellranger.reformat <- pbmc.t.cellranger %>%
filter(productive == TRUE) %>%
mutate(chain_grp = ifelse(chain == "TRA", "tra", "trb")) %>%
group_by(barcode, chain_grp) %>%
mutate(chain_rnk = row_number(desc(umis))) %>%
ungroup() %>%
filter(chain_rnk == 1) %>%
pivot_wider(id_cols = c("barcode", "is_cell"),
names_from = "chain_grp",
values_from = -c("barcode", "is_cell", "chain_grp")) %>%
mutate(is_paired_tr = ifelse(!(is.na(v_gene_tra)) & !(is.na(v_gene_trb)), TRUE, FALSE))
Join the T and B cell data and remove any cells that have data for both B and T:
pbmc.cellranger.data <- pbmc.b.cellranger.reformat %>%
full_join(pbmc.t.cellranger.reformat, by = c("barcode", "is_cell")) %>%
mutate(dual_cell = ifelse(!(is.na(is_paired_ig)) & !(is.na(is_paired_tr)), TRUE, FALSE)) %>%
filter(dual_cell == FALSE) %>%
mutate(vdj_cell_type = case_when(is.na(is_paired_ig) ~ "T",
is.na(is_paired_tr) ~ "B"))
table(pbmc.cellranger.data$vdj_cell_type)
##
## B T
## 886 5232
Now, format as a data.frame with the cell barcodes as the row names:
pbmc.cellranger.dataframe <- as.data.frame(pbmc.cellranger.data)
rownames(pbmc.cellranger.dataframe) <- pbmc.cellranger.dataframe$barcode
pbmc.cellranger.dataframe <- pbmc.cellranger.dataframe %>% select(-c(barcode))
Finally, can add the metadata to the Seurat object:
pbmc.seurat.obj <- AddMetaData(pbmc.seurat.obj, pbmc.cellranger.dataframe)
colnames(pbmc.seurat.obj@meta.data)
## [1] "orig.ident" "nCount_RNA" "nFeature_RNA"
## [4] "nCount_SCT" "nFeature_SCT" "SCT_snn_res.0.8"
## [7] "seurat_clusters" "SCT_snn_res.0.7" "SCT_snn_res.0.6"
## [10] "SCT_snn_res.0.5" "SCT_snn_res.0.4" "SCT_snn_res.0.3"
## [13] "SCT_snn_res.0.2" "azimuth_l1" "azimuth_l2"
## [16] "azimuth_l3" "is_cell" "contig_id_igkl"
## [19] "contig_id_igh" "high_confidence_igkl" "high_confidence_igh"
## [22] "length_igkl" "length_igh" "chain_igkl"
## [25] "chain_igh" "v_gene_igkl" "v_gene_igh"
## [28] "d_gene_igkl" "d_gene_igh" "j_gene_igkl"
## [31] "j_gene_igh" "c_gene_igkl" "c_gene_igh"
## [34] "full_length_igkl" "full_length_igh" "productive_igkl"
## [37] "productive_igh" "cdr3_igkl" "cdr3_igh"
## [40] "cdr3_nt_igkl" "cdr3_nt_igh" "reads_igkl"
## [43] "reads_igh" "umis_igkl" "umis_igh"
## [46] "raw_clonotype_id_igkl" "raw_clonotype_id_igh" "raw_consensus_id_igkl"
## [49] "raw_consensus_id_igh" "chain_rnk_igkl" "chain_rnk_igh"
## [52] "is_paired_ig" "contig_id_tra" "contig_id_trb"
## [55] "high_confidence_tra" "high_confidence_trb" "length_tra"
## [58] "length_trb" "chain_tra" "chain_trb"
## [61] "v_gene_tra" "v_gene_trb" "d_gene_tra"
## [64] "d_gene_trb" "j_gene_tra" "j_gene_trb"
## [67] "c_gene_tra" "c_gene_trb" "full_length_tra"
## [70] "full_length_trb" "productive_tra" "productive_trb"
## [73] "cdr3_tra" "cdr3_trb" "cdr3_nt_tra"
## [76] "cdr3_nt_trb" "reads_tra" "reads_trb"
## [79] "umis_tra" "umis_trb" "raw_clonotype_id_tra"
## [82] "raw_clonotype_id_trb" "raw_consensus_id_tra" "raw_consensus_id_trb"
## [85] "chain_rnk_tra" "chain_rnk_trb" "is_paired_tr"
## [88] "dual_cell" "vdj_cell_type"
Now the VDJ data is available for use using the group.by
and split.by for many Seurat functions.